1 Multiple Linear Regression for Wire Bond Strength Dataset

1.1 Loading Data and Visualization

Note: You must change the file paths in the read.csv() functions below to match the location of the files on your computer, for example "C:/Users/<YourUsername>/Documents/wire-bond.csv" on Windows.

# Read data. Change the path as necessary.
# Example: bond.data <- read.csv("data/wire-bond.csv")
bond.data.path <- "wire-bond.csv"
# Fail early with a clear message instead of read.csv's terse
# "cannot open the connection" error when the path is wrong.
if (!file.exists(bond.data.path)) {
  stop("Cannot find '", bond.data.path,
       "' -- edit the path to match the file location on your computer.",
       call. = FALSE)
}
bond.data <- read.csv(bond.data.path)

# This will now be automatically rendered as a paged table
bond.data

2D Visualization

par(mfrow = c(1, 2), mar = c(5, 4, 2, 1))

# 1) length vs strength.  Keep only rows where both variables are observed;
# complete.cases() is equivalent to !is.na(...) & !is.na(...).
i1 <- which(complete.cases(bond.data$length, bond.data$strength))
plot(bond.data$length[i1], bond.data$strength[i1],
     pch = 19, xlab = "Wire Length", ylab = "Pull strength")
# Label each point with its original row index
# (use seq_along(i1) instead if you prefer labels 1..n).
text(bond.data$length[i1], bond.data$strength[i1],
     labels = i1, cex = 0.75, pos = 1, offset = 0.4)

# 2) height vs strength, same treatment.
i2 <- which(complete.cases(bond.data$height, bond.data$strength))
plot(bond.data$height[i2], bond.data$strength[i2],
     pch = 19, xlab = "Die height", ylab = "Pull strength")
text(bond.data$height[i2], bond.data$strength[i2],
     labels = i2, cex = 0.75, pos = 1, offset = 0.4)

3D Visualization

# 1. Install and load the required package
# (run install.packages("scatterplot3d") once if it is not installed)

library(scatterplot3d)

# Reset the plotting device to a single panel (the 2D section used a 1x2 layout).
par (mfrow = c(1,1))
# 2. Create the 3D scatterplot and store its object;
# the returned object exposes plane3d() for overlaying a fitted plane.
s3d <- with(bond.data, scatterplot3d(
  x = length,
  y = height,
  z = strength,
  pch = 19,
  color = "steelblue",
  main = "3D Scatterplot: Strength vs. Length and Height",
  xlab = "Length",
  ylab = "Height",
  zlab = "Strength",
  angle = 60
))

# 3. Fit a linear model (this `fit` object is reused in later sections)
fit <- lm(strength ~ length + height, data = bond.data)

# 4. Add the regression plane to the plot
s3d$plane3d(fit, lty.box = "solid")

Dynamic 3D Visualization of 3D Data

# 1. Load necessary libraries
library(plotly)
library(dplyr)

# NOTE: This script assumes you have a data frame named 'bond.data'
# with columns 'length', 'height', and 'strength' already loaded.

# 2. Create the initial scatter plot object.
# Markers are coloured by strength (Viridis scale) so high/low values
# stand out; the ~ formula syntax maps data columns to plot aesthetics.
fig <- plot_ly(
  data = bond.data, 
  x = ~length, 
  y = ~height, 
  z = ~strength
) %>%
  add_markers(
    marker = list(
      color = ~strength,
      colorscale = 'Viridis',
      showscale = TRUE,
      colorbar = list(title = 'Strength')
    ),
    name = 'Data Points'
  ) %>%
  layout(
    title = "Interactive 3D Scatter Plot of Bond Strength",
    scene = list(
      xaxis = list(title = 'Length'),
      yaxis = list(title = 'Height'),
      zaxis = list(title = 'Strength')
    )
  )

# 3. Display the scatter plot
# In an interactive session (like RStudio), this line will render the plot.
fig

Add Fitted Surface

# 4. Fit a linear model to the data
fit <- lm(strength ~ length + height, data = bond.data)

# 5. Prepare data for the regression plane surface
resolution <- 20
axis_x <- seq(min(bond.data$length), max(bond.data$length), length.out = resolution)
axis_y <- seq(min(bond.data$height), max(bond.data$height), length.out = resolution)

# z_matrix[i, j] is the model prediction at length = axis_x[i],
# height = axis_y[j]; i.e. rows index x and columns index y.
z_matrix <- outer(
  axis_x,
  axis_y,
  function(len, ht) {
    predict(fit, newdata = data.frame(length = len, height = ht))
  }
)

# 6. Add the surface to the EXISTING plot object and display the result.
# NOTE: plotly's surface trace expects z[i, j] to correspond to (y[i], x[j])
# -- rows index y, columns index x -- which is the TRANSPOSE of z_matrix as
# built above. Passing z_matrix directly renders a flipped plane (the
# square 20x20 grid hides the error), so transpose it here.
fig <- fig %>%
  add_surface(
    x = ~axis_x,
    y = ~axis_y,
    z = ~t(z_matrix),
    colorscale = 'Blues',
    opacity = 0.7,
    showscale = FALSE,
    name = 'Regression Plane'
  )

# 7. Display the final, updated plot
fig

1.2 Model Fitting and Summary

We fit a multiple linear regression model with strength as the response variable and length and height as predictors.

# Fit the multiple linear regression model strength ~ length + height.
# This `fit` object is reused throughout the document (intervals,
# prediction, diagnostics, influence measures).
fit <- lm(strength ~ length + height, data = bond.data)

# Display the classic summary output (this is not a data frame, so it's unaffected)
summary(fit)
# 
# Call:
# lm(formula = strength ~ length + height, data = bond.data)
# 
# Residuals:
#    Min     1Q Median     3Q    Max 
# -3.865 -1.542 -0.362  1.196  5.841 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 2.263791   1.060066   2.136 0.044099 *  
# length      2.744270   0.093524  29.343  < 2e-16 ***
# height      0.012528   0.002798   4.477 0.000188 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 2.288 on 22 degrees of freedom
# Multiple R-squared:  0.9811,  Adjusted R-squared:  0.9794 
# F-statistic: 572.2 on 2 and 22 DF,  p-value: < 2.2e-16

The summary provides the ANOVA F-test for overall significance, \(R^2\), adjusted \(R^2\), and t-tests for individual coefficients.

1.3 Confidence Intervals and Model Components

We can extract confidence intervals for the regression coefficients, as well as fitted values and residuals.

# Confidence intervals for the coefficients (95% by default;
# matrix output, will be auto-paged)
confint(fit)
#                   2.5 %     97.5 %
# (Intercept) 0.065348613 4.46223426
# length      2.550313061 2.93822623
# height      0.006724246 0.01833138
# Fitted values and ordinary residuals e = y - y.hat
# (matrix output, will be auto-paged)
pred <- fitted.values(fit)
e <- resid(fit)
data.frame(y = bond.data$strength, y.hat = pred, e = e)
# Extract the covariance matrix of the coefficient estimates
# (matrix output, will be auto-paged)
cov.mat <- vcov(fit)
cov.mat
#              (Intercept)        length        height
# (Intercept)  1.123740429 -3.921612e-02 -1.781991e-03
# length      -0.039216122  8.746709e-03 -9.903775e-05
# height      -0.001781991 -9.903775e-05  7.831149e-06
# Standard errors = sqrt of the diagonal of vcov; these match the
# "Std. Error" column in summary(fit).
# (vector output, needs to be a data.frame to be paged)
data.frame(std.error = sqrt(diag(cov.mat)))

2 RSS-based Inference: F-test, and adjusted \(R^2\)

The General Linear Model

The general linear model is expressed in matrix form as:

\[y = X\beta + \epsilon\]

  • \(y\): An \(n \times 1\) vector of the observed response variable.
  • \(X\): An \(n \times p\) matrix, called the design matrix, where each column is a predictor variable. The first column is typically a vector of ones for the intercept.
  • \(\beta\): A \(p \times 1\) vector of the unknown model parameters (the regression coefficients).
  • \(\epsilon\): An \(n \times 1\) vector of random errors.

The goal of Ordinary Least Squares (OLS) is to find the estimate \(\hat{\beta}\) that minimizes the sum of squared errors.

Fundamental Sum of Squares

These three metrics form the basis of variance decomposition in a linear model.

  • SST (Total Sum of Squares): This is the total variation in the response variable (y). It measures the squared differences between the observed values and their mean. \[SST = \sum_{i=1}^n (y_i - \bar{y})^2\]

  • SSE (Sum of Squared Errors): Also called the Residual Sum of Squares (RSS), this is the variation in the response variable that is not explained by the model. It’s the sum of the squared differences between the observed values and the model’s predicted values. \[SSE = \sum_{i=1}^n (y_i - \hat{y}_i)^2\]

  • SSR (Sum of Squares due to Regression): This is the variation in the response variable that is explained by the model. It’s the sum of the squared differences between the model’s predicted values and the mean of the response. \[SSR = \sum_{i=1}^n (\hat{y}_i - \bar{y})^2\]

These three quantities are related by the identity: SST = SSR + SSE.

Mean Squares (Variance Estimates)

Mean squares are created by dividing a sum of squares by its corresponding degrees of freedom, which turns them into average or “mean” squares that can be interpreted as variance estimates.

  • MST (Mean Squared Total): This is the sample variance of the response variable y. \[MST = \frac{SST}{n-1}\]

  • MSE (Mean Squared Error): This is the average squared prediction error, representing the unexplained variance after accounting for the number of parameters in the model. \[MSE = \frac{SSE}{n-p}\]

Adjusted Sum of Squares (\(SSR_{adj}\))

This is the special “penalized” version of the Sum of Squares Regression that we developed in our previous discussions. It subtracts a penalty from the standard SSR based on the model’s complexity and error.

\[SSR_{adj} = SSR - k \cdot MSE\]

  • \(k\) is the number of predictors (\(k = p-1\)).

Key Model Statistics

Using the terms defined above, we can now describe the F-statistic and adjusted R-squared.

F-statistic

The F-statistic tests the overall significance of the model by comparing the variance explained by the model to the unexplained variance. It is the ratio of the Mean Square Regression (MSR) to the Mean Squared Error (MSE).

\[F = \frac{MSR}{MSE} \quad \text{where} \quad MSR = \frac{SSR}{p-1}\]

Adjusted R-squared (\(R^2_{adj}\))

The Adjusted R-squared measures the proportion of variance explained by the model while penalizing for complexity. It can be expressed in two primary ways:

  1. Using MSE and MST: \[R^2_{adj} = 1 - \frac{MSE}{MST}\]

  2. Using our special \(SSR_{adj}\) term: \[R^2_{adj} = \frac{SSR_{adj}}{SST}\]

2.1 A Simulation Study to Understand the Distributions of RSS

The simulation’s purpose is to show how model evaluation metrics perform when predictors, which have no actual relationship with the outcome, are added to a model. This illustrates the concepts of overfitting and penalizing for model complexity.

Data Generating Model

The data is generated from a linear model where the response variable, y, is purely random noise. The specific model used to generate the n=30 observations is:

\[y_i = \beta_0 + \beta_1 x_{i1} + \beta_2 x_{i2} + \dots + \beta_{20} x_{i,20} + \epsilon_i\]

The parameters are set as follows:

  • Coefficients (\(\beta\)):

  • When \(H_0\) is true, \[\beta_0 = 0, \beta_1 = 0, \dots, \beta_{20} = 0\]

  • When \(H_1\) is true, \[\beta_0 = 0, \beta_1 = 1.5, \beta_2 = 0, \dots, \beta_{20} = 0\]

  • Predictors (\(x\)): The 20 predictor variables (\(x_1, \dots, x_{20}\)) are drawn from a standard normal distribution (mean=0, sd=1).

  • Error Term (\(\epsilon\)): The errors, \(\epsilon_i\), are drawn from a normal distribution with a mean of 0 and a standard deviation of \(\sigma=1\).

Under \(H_0\), all \(\beta\) coefficients are zero, so the true data generating model simplifies to \(y_i = \epsilon_i\). This means there is no underlying signal for the regression models to find; any apparent relationships are due purely to random chance. (Under \(H_1\), only \(x_1\) carries a real signal.)

Sequence of Fitted Models

The simulation fits a sequence of nested ordinary least squares (OLS) regression models to the randomly generated data. The models increase in complexity as shown in the table below.

Model Name # of Predictors (k) # of Parameters (p) R Formula
Model 0 0 1 y ~ 1
Model 1 2 3 y ~ x_1 + x_2
Final Model 20 21 y ~ x_1 + ... + x_20

For each model in this sequence, the simulation calculates and plots the key statistics (SSE, \(R^2\), \(R^2_{adj}\), F-statistic) against the number of parameters (p) to visualize how they change as the model’s complexity increases.

2.1.1 When \(H_0\) is true

library(ggplot2)
library(patchwork)

# -----------------------------------------------------------
# Setup and Data Generation
# -----------------------------------------------------------
# One replicate when running interactively, 50 when rendering.
# Scalar condition, so `if` is preferred over ifelse().
N_sim <- if (interactive()) 1L else 50L
n <- 30                  # number of observations
p_max <- 20              # maximum number of predictors
beta <- rep(0, p_max)    # H0: every coefficient is zero (y is pure noise)
sigma <- 1               # error standard deviation
x_lim <- c(0, n)
ks <- seq(0, p_max, by = 2)  # predictor counts of the nested models

for (j in seq_len(N_sim)) {
  # set.seed(42)
  # Design matrix: p_max independent standard-normal predictors.
  X <- replicate(p_max, rnorm(n))
  colnames(X) <- paste0("x", seq_len(p_max))
  X <- as.data.frame(X)

  # Under H0 the signal X %*% beta is identically zero, so y = noise.
  y <- as.numeric(as.matrix(X) %*% beta + rnorm(n, sd = sigma))

  # Fit each nested model y ~ x1 + ... + xk and collect fit statistics.
  stats_list <- lapply(ks, function(k) {
    num_params <- k + 1
    if (k == 0) {
      fitsim <- lm(y ~ 1)  # intercept-only model
    } else {
      fml <- as.formula(
        paste("y ~", paste(paste0("x", seq_len(k)), collapse = " + "))
      )
      fitsim <- lm(fml, data = X)
    }

    s <- summary(fitsim)
    f_val <- s$fstatistic  # NULL for the intercept-only model

    # Scalar conditions: use if/else, not ifelse(); unname() keeps the
    # F value from leaking its name into the data frame's row names.
    data.frame(
      p = num_params,
      RSS = sum(residuals(fitsim)^2),
      R2 = if (k == 0) 0 else s$r.squared,
      R2_adj = if (k == 0) 0 else s$adj.r.squared,
      F_stat = if (is.null(f_val)) NA_real_ else unname(f_val[1])
    )
  })

  df <- do.call(rbind, stats_list)
  # The saturated model (p = n) interpolates the data: RSS = 0, R2 = 1.
  saturated_row <- data.frame(p = n, RSS = 0, R2 = 1, R2_adj = NA, F_stat = NA)
  df <- rbind(df, saturated_row)

  # --- Plotting Logic ---

  # Plot 1: RSS vs. model size; RSS can only decrease as p grows.
  g1 <- ggplot(df, aes(x = p, y = RSS)) +
    geom_line(color = "black") +
    geom_point(color = "black") +
    geom_point(data = subset(df, p == n), size = 2) +
    annotate("text", x = n, y = 0, label = "(n, 0)", vjust = -0.8, size = 3) +
    scale_x_continuous(limits = x_lim, breaks = pretty(x_lim)) +
    labs(
      title = "Residual Sum of Squares vs. Number of Parameters",
      x = "Number of parameters (p)",
      y = "Residual Sum of Squares (RSS)"
    ) +
    theme_minimal(base_size = 13)

  # Plot 2: F-statistic and R-squared values on one panel.
  # R2 / R2_adj are rescaled by the max F so they share the F axis.
  scale_factor <- max(df$F_stat, na.rm = TRUE)

  g2 <- ggplot(df, aes(x = p)) +
    geom_line(aes(y = F_stat, color = "F-statistic")) +
    geom_point(aes(y = F_stat, color = "F-statistic")) +
    geom_line(aes(y = R2 * scale_factor, color = "R2")) +
    geom_point(aes(y = R2 * scale_factor, color = "R2")) +
    geom_line(aes(y = R2_adj * scale_factor, color = "R2_adj"), linetype = "dashed") +
    geom_point(aes(y = R2_adj * scale_factor, color = "R2_adj")) +
    scale_x_continuous(limits = x_lim, breaks = pretty(x_lim)) +
    scale_y_continuous(
      name = "F-statistic",
      # Secondary axis undoes the rescaling so R2 reads as a percentage.
      sec.axis = sec_axis(~ . / scale_factor,
                          name = expression(paste(R^2, " / ", R[adj]^2)),
                          labels = scales::percent)
    ) +
    # Use expression() so the legend shows proper superscripts/subscripts.
    scale_color_manual(
      name = "Metric",
      values = c("F-statistic" = "firebrick", "R2" = "blue", "R2_adj" = "cyan4"),
      labels = c("F-statistic", expression(R^2), expression(R[adj]^2))
    ) +
    labs(
      title = "Model Fit Statistics vs. Number of Parameters",
      x = "Number of parameters (p)"
    ) +
    theme_minimal(base_size = 13) +
    theme(legend.position = "top")

  # Combine (patchwork's `/` stacks g1 above g2) and print.
  print(g1 / g2)
}

2.1.2 When \(H_1\) is true

library(ggplot2)
library(patchwork)

# -----------------------------------------------------------
# Setup and Data Generation (identical to the H0 run except beta[1])
# -----------------------------------------------------------
# One replicate when running interactively, 50 when rendering.
# Scalar condition, so `if` is preferred over ifelse().
N_sim <- if (interactive()) 1L else 50L
n <- 30                  # number of observations
p_max <- 20              # maximum number of predictors
beta <- rep(0, p_max)
beta[1] <- 1.5           # H1: only x1 carries a real signal
sigma <- 1               # error standard deviation
x_lim <- c(0, n)
ks <- seq(0, p_max, by = 2)  # predictor counts of the nested models

for (j in seq_len(N_sim)) {
  # set.seed(42)
  # Design matrix: p_max independent standard-normal predictors.
  X <- replicate(p_max, rnorm(n))
  colnames(X) <- paste0("x", seq_len(p_max))
  X <- as.data.frame(X)

  # Under H1 the response is beta[1] * x1 plus noise.
  y <- as.numeric(as.matrix(X) %*% beta + rnorm(n, sd = sigma))

  # Fit each nested model y ~ x1 + ... + xk and collect fit statistics.
  stats_list <- lapply(ks, function(k) {
    num_params <- k + 1
    if (k == 0) {
      fitsim <- lm(y ~ 1)  # intercept-only model
    } else {
      fml <- as.formula(
        paste("y ~", paste(paste0("x", seq_len(k)), collapse = " + "))
      )
      fitsim <- lm(fml, data = X)
    }

    s <- summary(fitsim)
    f_val <- s$fstatistic  # NULL for the intercept-only model

    # Scalar conditions: use if/else, not ifelse(); unname() keeps the
    # F value from leaking its name into the data frame's row names.
    data.frame(
      p = num_params,
      RSS = sum(residuals(fitsim)^2),
      R2 = if (k == 0) 0 else s$r.squared,
      R2_adj = if (k == 0) 0 else s$adj.r.squared,
      F_stat = if (is.null(f_val)) NA_real_ else unname(f_val[1])
    )
  })

  df <- do.call(rbind, stats_list)
  # The saturated model (p = n) interpolates the data: RSS = 0, R2 = 1.
  saturated_row <- data.frame(p = n, RSS = 0, R2 = 1, R2_adj = NA, F_stat = NA)
  df <- rbind(df, saturated_row)

  # --- Plotting Logic ---

  # Plot 1: RSS vs. model size; RSS can only decrease as p grows.
  g1 <- ggplot(df, aes(x = p, y = RSS)) +
    geom_line(color = "black") +
    geom_point(color = "black") +
    geom_point(data = subset(df, p == n), size = 2) +
    annotate("text", x = n, y = 0, label = "(n, 0)", vjust = -0.8, size = 3) +
    scale_x_continuous(limits = x_lim, breaks = pretty(x_lim)) +
    labs(
      title = "Residual Sum of Squares vs. Number of Parameters",
      x = "Number of parameters (p)",
      y = "Residual Sum of Squares (RSS)"
    ) +
    theme_minimal(base_size = 13)

  # Plot 2: F-statistic and R-squared values on one panel.
  # R2 / R2_adj are rescaled by the max F so they share the F axis.
  scale_factor <- max(df$F_stat, na.rm = TRUE)

  g2 <- ggplot(df, aes(x = p)) +
    geom_line(aes(y = F_stat, color = "F-statistic")) +
    geom_point(aes(y = F_stat, color = "F-statistic")) +
    geom_line(aes(y = R2 * scale_factor, color = "R2")) +
    geom_point(aes(y = R2 * scale_factor, color = "R2")) +
    geom_line(aes(y = R2_adj * scale_factor, color = "R2_adj"), linetype = "dashed") +
    geom_point(aes(y = R2_adj * scale_factor, color = "R2_adj")) +
    scale_x_continuous(limits = x_lim, breaks = pretty(x_lim)) +
    scale_y_continuous(
      name = "F-statistic",
      # Secondary axis undoes the rescaling so R2 reads as a percentage.
      sec.axis = sec_axis(~ . / scale_factor,
                          name = expression(paste(R^2, " / ", R[adj]^2)),
                          labels = scales::percent)
    ) +
    # Use expression() so the legend shows proper superscripts/subscripts.
    scale_color_manual(
      name = "Metric",
      values = c("F-statistic" = "firebrick", "R2" = "blue", "R2_adj" = "cyan4"),
      labels = c("F-statistic", expression(R^2), expression(R[adj]^2))
    ) +
    labs(
      title = "Model Fit Statistics vs. Number of Parameters",
      x = "Number of parameters (p)"
    ) +
    theme_minimal(base_size = 13) +
    theme(legend.position = "top")

  # Combine (patchwork's `/` stacks g1 above g2) and print.
  print(g1 / g2)
}

2.2 Example

This section uses data on the weight, height, and age of children to demonstrate partial F-tests.

# Data: weight (response), height and age (predictors) for n = 12 children.
# The standalone vectors are kept because later chunks reference them
# through the model formulas.
wgt <- c(64, 71, 53, 67, 55, 58, 77, 57, 56, 51, 76, 68)  # weight
hgt <- c(57, 59, 49, 62, 51, 50, 55, 48, 42, 42, 61, 57)  # height
age <- c(8, 10, 6, 11, 8, 7, 10, 9, 10, 6, 12, 9)         # age
child.data <- data.frame(wgt = wgt, hgt = hgt, age = age)

2.2.1 Problem 1: Height then Age

Here we test if adding age significantly improves the model that already contains hgt.

# Full model: weight regressed on BOTH height and age.
fit_age_hgt <- lm(wgt ~ hgt + age, data = child.data)
summary(fit_age_hgt)
# 
# Call:
# lm(formula = wgt ~ hgt + age, data = child.data)
# 
# Residuals:
#     Min      1Q  Median      3Q     Max 
# -6.8708 -1.7004  0.3454  1.4642 10.2336 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)  
# (Intercept)   6.5530    10.9448   0.599   0.5641  
# hgt           0.7220     0.2608   2.768   0.0218 *
# age           2.0501     0.9372   2.187   0.0565 .
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 4.66 on 9 degrees of freedom
# Multiple R-squared:   0.78,   Adjusted R-squared:  0.7311 
# F-statistic: 15.95 on 2 and 9 DF,  p-value: 0.001099
# Compare the nested models (anova output is a data frame, will be auto-paged)
# Reduced model: height only.
fit_hgt <- lm(wgt ~ hgt, data = child.data)
summary(fit_hgt)
# 
# Call:
# lm(formula = wgt ~ hgt, data = child.data)
# 
# Residuals:
#     Min      1Q  Median      3Q     Max 
# -5.8736 -3.8973 -0.4402  2.2624 11.8375 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)   
# (Intercept)   6.1898    12.8487   0.482  0.64035   
# hgt           1.0722     0.2417   4.436  0.00126 **
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 5.471 on 10 degrees of freedom
# Multiple R-squared:  0.663,   Adjusted R-squared:  0.6293 
# F-statistic: 19.67 on 1 and 10 DF,  p-value: 0.001263
# Partial F-test: does adding age improve on the height-only model?
anova(fit_hgt, fit_age_hgt)

2.2.2 Problem 2: Age then Height

Here we test if adding hgt significantly improves the model that already contains age.

# Reduced model: age only.
fit_age <- lm(wgt ~  age, data = child.data)
summary(fit_age)
# 
# Call:
# lm(formula = wgt ~ age, data = child.data)
# 
# Residuals:
#     Min      1Q  Median      3Q     Max 
# -11.000  -3.911   1.143   4.071  10.000 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)   
# (Intercept)  30.5714     8.6137   3.549  0.00528 **
# age           3.6429     0.9551   3.814  0.00341 **
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 6.015 on 10 degrees of freedom
# Multiple R-squared:  0.5926,  Adjusted R-squared:  0.5519 
# F-statistic: 14.55 on 1 and 10 DF,  p-value: 0.003407
# Compare the nested models (anova output is a data frame, will be auto-paged)
# Full model: age AND height (same fit as Problem 1, terms in the other order).
fit_age_hgt <- lm(wgt ~ age+hgt, data = child.data)
summary(fit_age_hgt)
# 
# Call:
# lm(formula = wgt ~ age + hgt, data = child.data)
# 
# Residuals:
#     Min      1Q  Median      3Q     Max 
# -6.8708 -1.7004  0.3454  1.4642 10.2336 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)  
# (Intercept)   6.5530    10.9448   0.599   0.5641  
# age           2.0501     0.9372   2.187   0.0565 .
# hgt           0.7220     0.2608   2.768   0.0218 *
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 4.66 on 9 degrees of freedom
# Multiple R-squared:   0.78,   Adjusted R-squared:  0.7311 
# F-statistic: 15.95 on 2 and 9 DF,  p-value: 0.001099
# Partial F-test: does adding hgt improve on the age-only model?
anova(fit_age, fit_age_hgt)

2.3 Relationship between t-test and partial F-test

The partial F-test is a statistical test used to determine if adding one or more covariates to a regression model provides a statistically significant improvement in explaining the dependent variable’s variance. It works by comparing a full model (with the new covariates) to a reduced model (without them).

A t-test for an individual coefficient is simply a special case of the partial F-test. In this scenario, the reduced model is the one where the specific covariate being tested has been omitted. The mathematical relationship is direct: the squared value of the t-statistic for that single coefficient is identical to the F-statistic from the partial F-test.

Therefore, the p-value of the t-test for an individual coefficient is about the statistical significance of that variable’s unique contribution after accounting for (or removing) the effects of all other covariates already in the model. It essentially asks, “Does this specific variable add any significant explanatory power, given that the other variables are already included?” This is why a variable might be significant on its own but not in a multiple regression model if its effect is already captured by other predictors.

3 Predictions for Mean Response and a Future Observation

This section demonstrates how to calculate confidence and prediction intervals for the wire bond strength data.

3.1 Confidence Interval for Mean Response

Construct a 95% CI on the mean pull strength for a wire bond with wire length = 8 and die height = 275.

# We use the model 'fit' from the first section (predict output is a matrix).
# 95% CI for the MEAN pull strength at length = 8, height = 275.
predict(fit, newdata = data.frame(length = 8, height = 275),
        interval = "confidence", level = 0.95)
#       fit      lwr      upr
# 1 27.6631 26.66324 28.66296

3.2 Prediction Interval for a New Observation

Construct a 95% PI on the pull strength for a new wire bond with wire length = 8 and die height = 275.

# 95% PI for a single NEW observation at the same covariates; wider than
# the CI above because it also includes the error variance of one draw.
predict(fit, newdata = data.frame(length = 8, height = 275),
        interval = "prediction", level = 0.95)
#       fit      lwr      upr
# 1 27.6631 22.81378 32.51241

4 Model Diagnostics

This section covers various methods for diagnosing the model fit using residuals and other measures.

4.1 Residual Calculations

We can calculate hat values, and various types of residuals. They are combined here into a single data frame which will be auto-paged.

# Leverages and the different flavours of residuals, collected in one
# data frame (one row per observation).
e <- resid(fit)                          # ordinary residuals y - y.hat
residuals_df <- data.frame(
  hat_values = hatvalues(fit),           # leverages (diag of hat matrix)
  ordinary_resid = e,
  standardized_resid = e / sigma(fit),   # scaled by residual std. error
  studentized_internal = rstandard(fit),
  studentized_external = rstudent(fit)
)
residuals_df

4.2 Residual Plots

A common practice is to analyze plots of the studentized residuals to check for non-linearity, non-constant variance, and non-normality.

# Residual analysis using internally studentized residuals
n <- nrow(bond.data)
r <- rstandard(fit) 
y.hat <- fitted.values(fit)

# 2x3 panel: normality check, then residuals against fitted values,
# observation order, and each predictor -- to spot curvature,
# non-constant variance, or serial patterns.
par(mfrow = c(2, 3))
qqnorm(r, main = "Normal Q-Q Plot")
qqline(r)
plot(y.hat, r, xlab = "Fitted values", ylab = "Studentized Residuals")
abline(h = 0)
plot(1:n, r, xlab = "Observation Number", ylab = "Studentized Residuals")
abline(h = 0)
plot(bond.data$length, r, xlab = "Wire Length", ylab = "Studentized Residuals")
abline(h = 0)
plot(bond.data$height, r, xlab = "Die Height", ylab = "Studentized Residuals")
abline(h = 0)

5 Influential Observations

We can identify influential data points using metrics like DFFITS, DFBETAS, and Cook’s D.

# Find DFFITS, Cook's distance and DFBETAS for every observation of the
# wire bond strength data; large values flag points with outsized
# influence on the fitted coefficients.
influence_df <- data.frame(
  dffits = dffits(fit),
  cook.D = cooks.distance(fit),
  dfbetas(fit)
)
influence_df

5.1 Plotting with the olsrr Package

The olsrr package provides convenient functions for plotting these influence diagnostics. Note: You only need to run install.packages("olsrr") once.

# install.packages("olsrr") # Run this once if you don't have the package
library(olsrr)

# Plot for Cook's D (observations above the threshold line are flagged)
ols_plot_cooksd_chart(fit)

# Plot for DFFITS
ols_plot_dffits(fit)

# Plot for DFBETAS (one panel per coefficient)
ols_plot_dfbetas(fit)

6 Polynomial Regression

This example uses data on the cost and production lot size of airplane sidewall panels.

# Average cost per unit (y) vs. production lot size (x) for airplane
# sidewall panels; cost drops with lot size but levels off, suggesting
# a quadratic trend.
x <- c(20, 25, 30, 35, 40, 50, 60, 65, 70, 75, 80, 90)
y <- c(1.81, 1.70, 1.65, 1.55, 1.48, 1.40, 1.30, 1.26, 1.24, 1.21, 1.20, 1.18)
# Second-order polynomial fit; I() protects x^2 inside the formula.
fit_poly <- lm(y ~ x + I(x^2))
summary(fit_poly)
# 
# Call:
# lm(formula = y ~ x + I(x^2))
# 
# Residuals:
#        Min         1Q     Median         3Q        Max 
# -0.0174763 -0.0065087  0.0001297  0.0071482  0.0151887 
# 
# Coefficients:
#               Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  2.198e+00  2.255e-02   97.48 6.38e-15 ***
# x           -2.252e-02  9.424e-04  -23.90 1.88e-09 ***
# I(x^2)       1.251e-04  8.658e-06   14.45 1.56e-07 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 0.01219 on 9 degrees of freedom
# Multiple R-squared:  0.9975,  Adjusted R-squared:  0.9969 
# F-statistic:  1767 on 2 and 9 DF,  p-value: 2.096e-12

The plot shows the fitted quadratic curve.

# Scatter plot of the data with the fitted quadratic overlaid.
plot(x, y, xlab = "Lot size, x", ylab = "Average cost per unit, y")
# Evaluate the fitted curve on a fine grid so it renders smoothly;
# predicting only at the 12 observed x values draws a visibly
# piecewise-linear curve. (type = "l" was redundant for lines().)
x_grid <- seq(min(x), max(x), length.out = 200)
lines(x_grid, predict(fit_poly, newdata = data.frame(x = x_grid)))

We can use a partial F-test to see if the quadratic term significantly improves the model.

# Partial F-test: does the quadratic term improve on the straight line?
# (Equivalent to the t-test on the I(x^2) coefficient: F = t^2.)
fit1 <- lm(y ~ x)
anova(fit1, fit_poly)

7 Indicator Variables

This section uses the SBP (Systolic Blood Pressure) dataset to model the effect of a categorical variable (sex).

# Note: Update this path to your local file location
sbpdata <- read.csv("sbpdata.csv")

# Fit the full model with an interaction term:
# sbp = b0 + b1*age + b2*sex + b3*(age*sex), so each sex gets its own
# intercept (b0 vs b0 + b2) and its own slope (b1 vs b1 + b3).
fit.full <- lm(sbp ~ age + sex + age:sex, data = sbpdata)
summary(fit.full)
# 
# Call:
# lm(formula = sbp ~ age + sex + age:sex, data = sbpdata)
# 
# Residuals:
#     Min      1Q  Median      3Q     Max 
# -20.647  -3.410   1.254   4.314  21.153 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 97.07708    5.17046  18.775  < 2e-16 ***
# age          0.94932    0.10864   8.738 1.43e-12 ***
# sex         12.96144    7.01172   1.849   0.0691 .  
# age:sex      0.01203    0.14519   0.083   0.9342    
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 8.946 on 65 degrees of freedom
# Multiple R-squared:  0.7759,  Adjusted R-squared:  0.7656 
# F-statistic: 75.02 on 3 and 65 DF,  p-value: < 2.2e-16

7.1 Hypothesis Tests for Indicator Models

We can perform various hypothesis tests to see if the lines are coincident, parallel, or have equal intercepts.

# Test of Coincidence. H0: beta2 = beta3 = 0 (one common line for both sexes)
fit.coin <- lm(sbp ~ age, data = sbpdata)
anova(fit.coin, fit.full)
# Test of Parallelism. H0: beta3 = 0 (equal slopes; intercepts may differ)
fit.para <- lm(sbp ~ age + sex, data = sbpdata)
anova(fit.para, fit.full)
# Test for Equal Intercepts. H0: beta2 = 0 (assuming parallel lines).
# The two models required here are exactly fit.para and fit.coin above,
# so reuse them rather than refitting identical regressions.
fit1.inter <- fit.para
fit2.inter <- fit.coin
anova(fit2.inter, fit1.inter)

7.2 Plotting the Fitted Model

The plot below shows the separate regression lines for males and females.

# Define color for males (sex=1) and females (sex=0).
colors <- ifelse(sbpdata$sex == 1, "black", "gray")

# Scatter plot
plot(sbpdata$age, sbpdata$sbp, xlab = "Age", ylab = "SBP", col = colors)

# Add the fitted lines for males and females.
# For the model sbp ~ age + sex + age:sex the design row is
# c(1, age, sex, age*sex): cbind(1, age, 1, age) when sex = 1 and
# cbind(1, age, 0, 0) when sex = 0; multiplying by coef(fit.full)
# gives the fitted SBP along each line.
# NOTE(review): lines() connects points in data order, so this assumes
# sbpdata is sorted by age -- verify, or sort before plotting.
lines(sbpdata$age, cbind(1, sbpdata$age, 1, sbpdata$age) %*% coef(fit.full), col = "black")
lines(sbpdata$age, cbind(1, sbpdata$age, 0, 0) %*% coef(fit.full), col = "gray")

# Add legend
legend("topleft", legend = c("sex = 1 (male)", "sex = 0 (female)"), 
       lty = c(1, 1), col = c("black", "gray"))

8 Model Building

This section uses the olsrr package and wine quality data to demonstrate automated model selection techniques.

library(olsrr)
# Note: Update this path to your local file location
wine <- read.csv("wine.csv")

# A dot in the lm function means: use all other variables as predictors.
# This full model is the starting point for the selection methods below.
model <- lm(quality ~ ., data = wine)

8.1 All Possible Regression

This method evaluates every possible combination of predictors. The output is a data frame and will be auto-paged.

# Evaluate every subset of predictors and report the best model of each
# size together with its fit criteria (R^2, adjusted R^2, Mallows' C(p),
# AIC, ...).
ols_step_best_subset(model)
#              Best Subsets Regression             
# -------------------------------------------------
# Model Index    Predictors
# -------------------------------------------------
#      1         flavor                             
#      2         flavor oakiness                    
#      3         aroma flavor oakiness              
#      4         clarity aroma flavor oakiness      
#      5         clarity aroma body flavor oakiness 
# -------------------------------------------------
# 
#                                                   Subsets Regression Summary                                                   
# -------------------------------------------------------------------------------------------------------------------------------
#                        Adj.        Pred                                                                                         
# Model    R-Square    R-Square    R-Square     C(p)       AIC        SBIC        SBC        MSEP       FPE       HSP       APC  
# -------------------------------------------------------------------------------------------------------------------------------
#   1        0.6242      0.6137      0.5868    9.0436    130.0214    21.6859    134.9341    61.4102    1.7010    0.0462    0.4176 
#   2        0.6611      0.6417      0.6058    6.8132    128.0901    20.1242    134.6404    57.0033    1.6171    0.0441    0.3970 
#   3        0.7038      0.6776      0.6379    3.9278    124.9781    18.0702    133.1661    51.3383    1.4906    0.0409    0.3659 
#   4        0.7147      0.6801      0.6102    4.6747    125.5480    19.2854    135.3736    50.9872    1.5143    0.0418    0.3717 
#   5        0.7206      0.6769       0.587    6.0000    126.7552    21.0956    138.2183    51.5452    1.5649    0.0436    0.3842 
# -------------------------------------------------------------------------------------------------------------------------------
# AIC: Akaike Information Criteria 
#  SBIC: Sawa's Bayesian Information Criteria 
#  SBC: Schwarz Bayesian Criteria 
#  MSEP: Estimated error of prediction, assuming multivariate normality 
#  FPE: Final Prediction Error 
#  HSP: Hocking's Sp 
#  APC: Amemiya Prediction Criteria

8.2 Automated Stepwise Procedures

We can also use backward, forward, or stepwise selection to find a good model. These functions also return data frames.

# Backward Elimination (alpha_out = 0.1): start from the full model and
# repeatedly drop the predictor with the largest p-value above 0.1.
ols_step_backward_p(model, p_val = 0.1)
# 
# 
#                              Stepwise Summary                             
# ------------------------------------------------------------------------
# Step    Variable        AIC        SBC       SBIC       R2       Adj. R2 
# ------------------------------------------------------------------------
#  0      Full Model    126.755    138.218    21.096    0.72060    0.67694 
#  1      body          125.548    135.374    19.285    0.71471    0.68013 
#  2      clarity       124.978    133.166    18.070    0.70377    0.67763 
# ------------------------------------------------------------------------
# 
# Final Model Output 
# ------------------
# 
#                          Model Summary                          
# ---------------------------------------------------------------
# R                       0.839       RMSE                 1.098 
# R-Squared               0.704       MSE                  1.207 
# Adj. R-Squared          0.678       Coef. Var            9.338 
# Pred R-Squared          0.638       AIC                124.978 
# MAE                     0.868       SBC                133.166 
# ---------------------------------------------------------------
#  RMSE: Root Mean Square Error 
#  MSE: Mean Square Error 
#  MAE: Mean Absolute Error 
#  AIC: Akaike Information Criteria 
#  SBC: Schwarz Bayesian Criteria 
# 
#                                ANOVA                                
# -------------------------------------------------------------------
#                Sum of                                              
#               Squares        DF    Mean Square      F         Sig. 
# -------------------------------------------------------------------
# Regression    108.935         3         36.312    26.925    0.0000 
# Residual       45.853        34          1.349                     
# Total         154.788        37                                    
# -------------------------------------------------------------------
# 
#                                   Parameter Estimates                                    
# ----------------------------------------------------------------------------------------
#       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
# ----------------------------------------------------------------------------------------
# (Intercept)     6.467         1.333                  4.852    0.000     3.759     9.176 
#       aroma     0.580         0.262        0.307     2.213    0.034     0.047     1.113 
#      flavor     1.200         0.275        0.603     4.364    0.000     0.641     1.758 
#    oakiness    -0.602         0.264       -0.217    -2.278    0.029    -1.140    -0.065 
# ----------------------------------------------------------------------------------------
# Forward Selection (alpha_in = 0.1):
# start from the intercept-only base model and repeatedly add the
# predictor with the smallest p-value, as long as it is below p_val = 0.1.
# Here the entry order is flavor, oakiness, aroma; the final model matches
# the one reached by backward elimination above (same three predictors).
ols_step_forward_p(model, p_val = 0.1)
# 
# 
#                              Stepwise Summary                             
# ------------------------------------------------------------------------
# Step    Variable        AIC        SBC       SBIC       R2       Adj. R2 
# ------------------------------------------------------------------------
#  0      Base Model    165.209    168.484    55.141    0.00000    0.00000 
#  1      flavor        130.021    134.934    21.686    0.62417    0.61373 
#  2      oakiness      128.090    134.640    20.124    0.66111    0.64175 
#  3      aroma         124.978    133.166    18.070    0.70377    0.67763 
# ------------------------------------------------------------------------
# 
# Final Model Output 
# ------------------
# 
#                          Model Summary                          
# ---------------------------------------------------------------
# R                       0.839       RMSE                 1.098 
# R-Squared               0.704       MSE                  1.207 
# Adj. R-Squared          0.678       Coef. Var            9.338 
# Pred R-Squared          0.638       AIC                124.978 
# MAE                     0.868       SBC                133.166 
# ---------------------------------------------------------------
#  RMSE: Root Mean Square Error 
#  MSE: Mean Square Error 
#  MAE: Mean Absolute Error 
#  AIC: Akaike Information Criteria 
#  SBC: Schwarz Bayesian Criteria 
# 
#                                ANOVA                                
# -------------------------------------------------------------------
#                Sum of                                              
#               Squares        DF    Mean Square      F         Sig. 
# -------------------------------------------------------------------
# Regression    108.935         3         36.312    26.925    0.0000 
# Residual       45.853        34          1.349                     
# Total         154.788        37                                    
# -------------------------------------------------------------------
# 
#                                   Parameter Estimates                                    
# ----------------------------------------------------------------------------------------
#       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
# ----------------------------------------------------------------------------------------
# (Intercept)     6.467         1.333                  4.852    0.000     3.759     9.176 
#      flavor     1.200         0.275        0.603     4.364    0.000     0.641     1.758 
#    oakiness    -0.602         0.264       -0.217    -2.278    0.029    -1.140    -0.065 
#       aroma     0.580         0.262        0.307     2.213    0.034     0.047     1.113 
# ----------------------------------------------------------------------------------------
# Stepwise Regression (alpha_in = 0.1, alpha_out = 0.1):
# combines the two procedures — at each step a predictor may be added
# (p-value < p_enter) or a previously added predictor removed
# (p-value > p_remove). The "(+)" in the summary marks an addition;
# a removal would be marked "(-)". In this run no variable is ever
# removed, so the path and final model match forward selection exactly.
ols_step_both_p(model, p_enter = 0.1, p_remove = 0.1)
# 
# 
#                               Stepwise Summary                              
# --------------------------------------------------------------------------
# Step    Variable          AIC        SBC       SBIC       R2       Adj. R2 
# --------------------------------------------------------------------------
#  0      Base Model      165.209    168.484    55.141    0.00000    0.00000 
#  1      flavor (+)      130.021    134.934    21.686    0.62417    0.61373 
#  2      oakiness (+)    128.090    134.640    20.124    0.66111    0.64175 
#  3      aroma (+)       124.978    133.166    18.070    0.70377    0.67763 
# --------------------------------------------------------------------------
# 
# Final Model Output 
# ------------------
# 
#                          Model Summary                          
# ---------------------------------------------------------------
# R                       0.839       RMSE                 1.098 
# R-Squared               0.704       MSE                  1.207 
# Adj. R-Squared          0.678       Coef. Var            9.338 
# Pred R-Squared          0.638       AIC                124.978 
# MAE                     0.868       SBC                133.166 
# ---------------------------------------------------------------
#  RMSE: Root Mean Square Error 
#  MSE: Mean Square Error 
#  MAE: Mean Absolute Error 
#  AIC: Akaike Information Criteria 
#  SBC: Schwarz Bayesian Criteria 
# 
#                                ANOVA                                
# -------------------------------------------------------------------
#                Sum of                                              
#               Squares        DF    Mean Square      F         Sig. 
# -------------------------------------------------------------------
# Regression    108.935         3         36.312    26.925    0.0000 
# Residual       45.853        34          1.349                     
# Total         154.788        37                                    
# -------------------------------------------------------------------
# 
#                                   Parameter Estimates                                    
# ----------------------------------------------------------------------------------------
#       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
# ----------------------------------------------------------------------------------------
# (Intercept)     6.467         1.333                  4.852    0.000     3.759     9.176 
#      flavor     1.200         0.275        0.603     4.364    0.000     0.641     1.758 
#    oakiness    -0.602         0.264       -0.217    -2.278    0.029    -1.140    -0.065 
#       aroma     0.580         0.262        0.307     2.213    0.034     0.047     1.113 
# ----------------------------------------------------------------------------------------

9 Multicollinearity

This section explores the issue of multicollinearity, where predictors are highly correlated.

9.1 A Simple Example

# Toy example of severe multicollinearity:
# x2 is (approximately) x1^2, and over this narrow range of x1 values
# the two predictors are almost perfectly linearly related
# (cor = 0.9999713 below).
y <- c(19, 20, 37, 39, 36, 38)
x1 <- c(4, 4, 7, 7, 7.1, 7.1)
x2 <- c(16, 16, 49, 49, 50.4, 50.4)
cor(data.frame(x1, x2))
#           x1        x2
# x1 1.0000000 0.9999713
# x2 0.9999713 1.0000000
# Full model: note the classic multicollinearity symptom — R-squared is
# very high (0.9897) and the overall F-test is significant, yet NEITHER
# individual coefficient is significant, because the near-redundant
# predictors inflate each other's standard errors.
fit_multi <- lm(y ~ x1 + x2)
summary(fit_multi)
# 
# Call:
# lm(formula = y ~ x1 + x2)
# 
# Residuals:
#    1    2    3    4    5    6 
# -0.5  0.5 -1.0  1.0 -1.0  1.0 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)
# (Intercept) -156.056    117.158  -1.332    0.275
# x1            65.444     45.890   1.426    0.249
# x2            -5.389      4.152  -1.298    0.285
# 
# Residual standard error: 1.225 on 3 degrees of freedom
# Multiple R-squared:  0.9897,  Adjusted R-squared:  0.9829 
# F-statistic: 144.3 on 2 and 3 DF,  p-value: 0.001043
# Simple model: dropping the redundant x2 barely changes the fit
# (R-squared falls only from 0.9897 to 0.9839), but the x1 coefficient
# is now estimated precisely (SE 0.3762 vs. 45.890 in the full model)
# and is highly significant.
fit1_multi <- lm(y ~ x1)
summary(fit1_multi)
# 
# Call:
# lm(formula = y ~ x1)
# 
# Residuals:
#       1       2       3       4       5       6 
# -0.5260  0.4740 -0.1925  1.8075 -1.7814  0.2186 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  -4.0293     2.3332  -1.727    0.159    
# x1            5.8888     0.3762  15.654 9.73e-05 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# Residual standard error: 1.325 on 4 degrees of freedom
# Multiple R-squared:  0.9839,  Adjusted R-squared:  0.9799 
# F-statistic: 245.1 on 1 and 4 DF,  p-value: 9.725e-05

9.2 VIFs in the Wine Quality Data

We can calculate the Variance Inflation Factor (VIF) for each predictor to diagnose multicollinearity. The VIF for predictor j is 1 / (1 − R²ⱼ), where R²ⱼ comes from regressing predictor j on all the other predictors; a VIF > 10 (i.e., R²ⱼ > 0.9) is often considered problematic.

# Keep only the predictor columns by dropping the last column.
# NOTE(review): this assumes the response 'quality' is the LAST column of
# 'wine' — verify against how the data were read in; the correlation
# matrix below showing only the five predictors confirms it for this run.
wine.x <- wine[, -ncol(wine)] # Assuming quality is the last column
# Pairwise correlations among the predictors: flavor is fairly strongly
# correlated with aroma (0.737) and body (0.647), so some variance
# inflation is expected, though none of the correlations is extreme.
cor(wine.x)
#              clarity     aroma       body      flavor  oakiness
# clarity   1.00000000 0.0619021 -0.3083783 -0.08515993 0.1832147
# aroma     0.06190210 1.0000000  0.5489102  0.73656121 0.2016444
# body     -0.30837826 0.5489102  1.0000000  0.64665917 0.1521059
# flavor   -0.08515993 0.7365612  0.6466592  1.00000000 0.1797605
# oakiness  0.18321471 0.2016444  0.1521059  0.17976051 1.0000000
# Find VIF's (and tolerances = 1/VIF) using the olsrr package; returns a
# data frame with one row per predictor in 'model'.
ols_vif_tol(model)